################################################
#
# Analysis script for PINK1 data set in MADS course
#
# by C. AMeli and A. Skupin 2023/10 with updates 2025/11
#
################################################
# ---- HPC setup ----
.libPaths("~/Rlibs")
# ================================= LIBRARIES ==================================
library(Seurat)
library(dplyr)
library(tidyverse)
library(viridisLite)
library(viridis)
library(ggplot2)
library(openxlsx)
library(future)
plan("multicore", workers = 16, future.seed = TRUE) # adjust 16 to the number of cores you request
options(future.globals.maxSize = 50 * 1024^3) # 50 GB max size for objects
# ================================ PARAMETERS ==================================
# set your working directory accordingly
#setwd('/Users/alexander.skupin/projects/teaching/WS2025/MADS/project1')
input_file = "SeuratFinal.rds"
output_folder = "Results/"
#to export things later
writeExcel = TRUE
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# ---- RELABELING ----
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# Load mapped data-sets
# options(java.parameters = "-Xmx80g")
S = readRDS(input_file)
summary(S)
## Length Class Mode
## 1 Seurat S4
dim(S)
## [1] 2000 25655
head(colnames(S))
## [1] "WT_37_34" "WT_37_42" "WT_37_47" "WT_37_49" "WT_37_50" "WT_37_52"
tail(colnames(S))
## [1] "ND_0_2995" "ND_0_2996" "ND_0_2997" "ND_0_2998" "ND_0_2999" "ND_0_3000"
head(colnames(S))
## [1] "WT_37_34" "WT_37_42" "WT_37_47" "WT_37_49" "WT_37_50" "WT_37_52"
head(rownames(S))
## [1] "TTR" "CGA" "CARTPT" "TPH1" "ANKRD1" "SST"
head(S[[]])
## orig.ident nCount_RNA nFeature_RNA Condition Day ConditionDay
## WT_37_34 WT 1456 1190 WT 37 WT_37
## WT_37_42 WT 2680 2256 WT 37 WT_37
## WT_37_47 WT 2258 1913 WT 37 WT_37
## WT_37_49 WT 2643 2225 WT 37 WT_37
## WT_37_50 WT 2613 2176 WT 37 WT_37
## WT_37_52 WT 2158 1847 WT 37 WT_37
## nCount_RNA_imputed nFeature_RNA_imputed
## WT_37_34 717.537 20798
## WT_37_42 2093.000 20798
## WT_37_47 1353.826 20798
## WT_37_49 1852.341 20798
## WT_37_50 2232.950 20798
## WT_37_52 1198.178 20798
Assays(S)
## [1] "RNA" "integrated" "RNA_imputed"
# Convert meta-data to factors
S$Day = factor(S$Day, levels = c(0,18,25,37,57))
S$Condition = str_replace_all(S$Condition, "ND","PINK1")
S$Condition = str_replace_all(S$Condition,"WT","CTL")
S$ConditionDay = paste(S$Condition,S$Day,sep = " ")
S$ConditionDay = factor(S$ConditionDay, levels = c("CTL 0","PINK1 0",
"CTL 18","PINK1 18",
"CTL 25","PINK1 25",
"CTL 37","PINK1 37",
"CTL 57","PINK1 57"))
S$ConditionDay = factor(S$ConditionDay, levels = c("CTL 0","CTL 18","CTL 25","CTL 37","CTL 57",
"PINK1 0","PINK1 18","PINK1 25","PINK1 37","PINK1 57"))
S$Condition = factor(S$Condition, levels = c("CTL","PINK1"))
# now you can have a look into the object ... summary, dim, head etc ...
## to get that scaled data in
expression_matrix <- GetAssayData(S, layer = "scale.data")
tempa <- subset(S, ConditionDay %in% "CTL 0")
expression_matrix_CTL_0 <- GetAssayData(tempa, layer = "scale.data")
# do we have already PCA information?
FeaturePlot(S, features="nCount_RNA")
FeaturePlot(S, features="nCount_RNA",split.by='Day')
DimPlot(S, split.by='Day')
ElbowPlot(S,ndims = 30)
DimHeatmap(S, dims = 1:9, cells = 1000, balanced = TRUE)
######## new Dim reduction since the current is based on full data set
# if u run into memory issues - u can also put everything into S instead of S_new
S_new = ScaleData(S)
S_new = RunPCA(S_new,npcs = 100)
ElbowPlot(S_new,ndims = 30)
DimHeatmap(S_new, dims = 1:9, cells = 1000, balanced = TRUE)
# here u should play around with dims, n.neigbors check also ?RunUMAP - something for reduction=? ; what happens for other assays?
set.seed(18)
S_new = RunUMAP(S_new ,dims = 1:100,assay = "integrated",n.neighbors = 50, seed.use = NULL)
# We can store different parameter set in the object by defining reduction.name = "nameofreduction" -default is "umap"
DimPlot(object = S_new, reduction = "umap")
DimPlot(object = S_new, reduction = "umap", group.by = "Condition", split.by = "Day",shuffle = TRUE)
DimPlot(object = S_new, reduction = "umap", group.by = "Day", split.by = "Condition",shuffle = TRUE)
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# ---- NORMALIZING ASSAYS BEFORE DEGs ----
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
S = NormalizeData(S,assay = "RNA")
S = NormalizeData(S,assay = "RNA_imputed")
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# ---- DEGs DAYWISE ----
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
DefaultAssay(S) = "RNA" # for the different Assays
S = SetIdent(object = S, value = S$ConditionDay)
for(pink in c("PINK1")){
for(day in c(0,18,25,37,57)) {
print(paste(pink,day))
temp = FindMarkers(S,
assay = "RNA",
ident.1 = paste("CTL",day,sep=" "),
ident.2 = paste(pink,day,sep=" "),
logfc.threshold = 0.25,
min.pct = 0.05,
densify = TRUE,
test.use = "wilcox",
latent.vars = "nFeature_RNA",
layer = "data")
temp$Day = day
temp$Gene = rownames(temp)
temp$ident.1 = "CTL"
temp$ident.2 = pink
rownames(temp) = paste(pink,day,c(1:dim(temp)[1]),sep="_")
if(day==0 & pink == "PINK1"){
DEG = temp
}else{
DEG = rbind(DEG,temp)
}
}
}
## [1] "PINK1 0"
## [1] "PINK1 18"
## [1] "PINK1 25"
## [1] "PINK1 37"
## [1] "PINK1 57"
DEG = DEG %>%
select(Gene,Day,pct.1,pct.2,avg_log2FC,p_val,p_val_adj,ident.1,ident.2) %>%
arrange(Day,Gene,ident.2)
# Export
if(writeExcel){
write.xlsx(DEG,paste(output_folder,"DEG_DETAILED_MADS.xlsx",sep=""),colNames = TRUE,rowNames = FALSE)
}
# Test the effect of the corresponding Assay
DefaultAssay(S) = "RNA_imputed"
#DefaultAssay(S) = "RNA"
FeaturePlot(S,features=c("ABHD13", "VIM", "TH"))
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# ---- CLUSTERING ----
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
DefaultAssay(S) = "integrated"
# play with the following functions - remember: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html
# ?FindNeighbors
S_new = FindNeighbors(S_new, reduction = "pca", dims = 1:10)
S_new = FindClusters(S_new, resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 25655
## Number of edges: 856197
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9244
## Number of communities: 11
## Elapsed time: 4 seconds
DimPlot(S_new)
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# ---- PLOTS TRANSCRIPTOMICS ----
# Condition WT vs ND - Day 0 to 57
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# ---- Umap plots ----
DimPlot(object = S, reduction = "umap", group.by = "Day",shuffle = TRUE,
label = TRUE, label.size = 5,label.box = TRUE,repel=TRUE) +
theme_minimal() +ggplot2::theme(legend.position = "none") +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "UMAP 1", y = "UMAP 2", title = "Umap by Day") +
theme(plot.title = element_text(hjust = 0.5))
DimPlot(object = S, reduction = "umap", group.by = "Condition",shuffle = TRUE,
label = TRUE, label.size = 5,label.box = TRUE,repel=TRUE) +
theme_minimal() +ggplot2::theme(legend.position = "none") +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "UMAP 1", y = "UMAP 2", title = "Umap by Condition") +
theme(plot.title = element_text(hjust = 0.5))
DimPlot(object = S, reduction = "umap", group.by = "ConditionDay",shuffle = TRUE,
label = TRUE, label.size = 5,label.box = TRUE,repel=TRUE) +
theme_minimal() +ggplot2::theme(legend.position = "none") +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "UMAP 1", y = "UMAP 2", title = "Umap by Condition and Day") +
theme(plot.title = element_text(hjust = 0.5))
DimPlot(object = S_new, reduction = "umap", group.by = "seurat_clusters",label = TRUE,shuffle = TRUE,
label.size = 5,label.box = TRUE,repel=TRUE) +
theme_minimal() +ggplot2::theme(legend.position = "none") +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "UMAP 1", y = "UMAP 2", title = "Umap by Cluster") +
theme(plot.title = element_text(hjust = 0.5)) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
DimPlot(object = S, reduction = "umap", group.by = "Day",shuffle = TRUE, split.by = "Condition",label = FALSE, label.size = 5,label.box = TRUE,repel=TRUE) +
theme_minimal() +
labs(x = "UMAP 1", y = "UMAP 2", title = "Umap by Day of Differentiation") +
theme(plot.title = element_text(hjust = 0.5)) +
# scale_fill_manual( values = rep(DayColors,1)) +
#scale_colour_manual( values = rep(DayColors,1)) +
theme(axis.line = element_line(color='black'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())
# Define colors for your clusters - expanded to handle more clusters
ClusterColors <- c(
"#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
"#9467bd", "#8c564b", "#e377c2", "#7f7f7f",
"#bcbd22", "#17becf", "#aec7e8", "#ffbb78",
"#98df8a", "#ff9896", "#c5b0d5"
)
DimPlot(object = S_new, reduction = "umap", group.by = "seurat_clusters",label = TRUE,shuffle = TRUE,
label.size = 3,label.box = TRUE,repel=FALSE) +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "UMAP 1", y = "UMAP 2", title = "Umap by Clusters") +
scale_fill_manual( values = ClusterColors) +
scale_colour_manual( values = ClusterColors) +
theme(axis.line = element_line(color='black'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())
# ---- Differentiation markers dot-plot ----
gene_complete = c("POU5F1","L1TD1","TDGF1","OTX2","SOX2","FOXA2","LMX1A","WNT5A","DDC","ASCL1","NR4A2","DCX","PBX1")
S = SetIdent(S,value = "Condition")
DotPlot(S,
features = gene_complete,
assay = "RNA_imputed",
group.by = "Day",
dot.scale = 5,
cols = c("white", "black"),
scale.min = 0,
scale.max = 1,
col.min = 0,
col.max = 1)
pinkset <- c( "PINK1 37" , "PINK1 57" , "PINK1 18" , "PINK1 25" , "PINK1 0")
DotPlot(subset(S,ConditionDay %in% pinkset),
features = gene_complete,
assay = "RNA_imputed",
group.by = "Day",
dot.scale = 5,
cols = c("white", "black"),
scale.min = 0,
scale.max = 1,
col.min = 0,
col.max = 1)
# ---- Differentiation markers heatmap ----
DefaultAssay(S) = "RNA_imputed"
S = ScaleData(S,assay = "RNA_imputed")
S = SetIdent(object = S, value = S$ConditionDay)
DoHeatmap(S,features = c("MYC","POU5F1","L1TD1","TDGF1","POLR3G","TERF1","FABP7","FOXA2","OTX2","SOX2","SOX6","WNT5A","LMX1A","ASCL1","DDC","NR4A2","DCX","PBX1","KCNJ6"),
slot = "scale.data", draw.lines = FALSE, raster = FALSE) + scale_fill_viridis()
############## for DEG Vulcano plots
volc = ggplot(DEG, aes(- avg_log2FC, -log(p_val_adj))) +
geom_point(color="grey", size=1) #+
# geom_point(data = DEG[vec_gene_down,],aes(- avg_log2FC, -log(p_val_adj)), color="red") +
#geom_point(data = DEG[vec_gene_up,],aes(- avg_log2FC, -log(p_val_adj)), color="blue")
volc + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"))
# For pathway and processes analysis
# This should be done daywise and cluster wise adressing the questions:
# what are Pathways, Molecular functions and Cellular components that chenge over time and between conditions
# GO Term enrichment using
library(gprofiler2)
#get a list of downregulated genes and then run GO enrichment
ranked_genes_fc <- setNames( DEG$avg_log2FC, DEG$Gene) # remember - how is FC defined in DEG: FC = log(cond1/cond2)
ranked_genes_fc <- sort(ranked_genes_fc, decreasing = TRUE)
gene_down <- names(ranked_genes_fc) # down in PINK1 means up in CTL: FC = CTL/PINK1
gost_res_down <- gost(gene_down,
organism = "hsapiens", # Specify organism (hsapiens = human)
ordered_query = FALSE,
significant = TRUE,
user_threshold = 0.05, # FDR threshold for significance
correction_method = "fdr") # Multiple testing correction method
# View the results
head(gost_res_down$result)
## query significant p_value term_size query_size intersection_size
## 1 query_1 TRUE 6.288546e-11 138 1796 115
## 2 query_1 TRUE 6.968468e-07 78 1796 67
## 3 query_1 TRUE 4.429716e-05 48 1796 43
## 4 query_1 TRUE 5.838506e-05 77 1796 63
## 5 query_1 TRUE 4.945187e-02 33 1796 28
## 6 query_1 TRUE 4.945187e-02 30 1796 26
## precision recall term_id source term_name
## 1 0.06403118 0.8333333 CORUM:351 CORUM Spliceosome
## 2 0.03730512 0.8589744 CORUM:320 CORUM 55S ribosome, mitochondrial
## 3 0.02394209 0.8958333 CORUM:324 CORUM 39S ribosomal subunit, mitochondrial
## 4 0.03507795 0.8181818 CORUM:1181 CORUM C complex spliceosome
## 5 0.01559020 0.8484848 CORUM:193 CORUM PA700-20S-PA28 complex
## 6 0.01447661 0.8666667 CORUM:2755 CORUM 17S U2 snRNP
## effective_domain_size source_order parents
## 1 3383 205 CORUM:0000000
## 2 3383 191 CORUM:0000000
## 3 3383 193 CORUM:0000000
## 4 3383 659 CORUM:0000000
## 5 3383 98 CORUM:0000000
## 6 3383 1174 CORUM:0000000
top_pathways_down <- gost_res_down$result[order(gost_res_down$result$p_value), ][1:20, ]
ggplot(top_pathways_down, aes(x = reorder(term_name, -p_value), y = -log10(p_value))) +
geom_point(aes(size = intersection_size, color = -log10(p_value))) + # Size based on number of DEGs in the pathway
scale_color_gradient(low = "blue", high = "red") + # Color gradient based on p-value
theme_minimal() +
coord_flip() + # Flip coordinates for better readability
labs(title = "Top 20 Enriched Pathways",
x = "Pathway",
y = "-log10(p-value)",
size = "Gene Count",
color = "-log10(p-value)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#here you can then look/plot also into other / lower ranks by changing the numbers in the [] like for 21:40
top_pathways_down_2 <- gost_res_down$result[order(gost_res_down$result$p_value), ][21:40, ]
ggplot(top_pathways_down_2, aes(x = reorder(term_name, -p_value), y = -log10(p_value))) +
geom_point(aes(size = intersection_size, color = -log10(p_value))) + # Size based on number of DEGs in the pathway
scale_color_gradient(low = "blue", high = "red") + # Color gradient based on p-value
theme_minimal() +
coord_flip() + # Flip coordinates for better readability
labs(title = "Top 21:40 Enriched Pathways",
x = "Pathway",
y = "-log10(p-value)",
size = "Gene Count",
color = "-log10(p-value)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# you can also make your specific list by defining an index vector
go_select <- c( 23:25, 31, 40)
top_pathways_down_sel <- gost_res_down$result[order(gost_res_down$result$p_value), ][go_select, ]
ggplot(top_pathways_down_sel, aes(x = reorder(term_name, -p_value), y = -log10(p_value))) +
geom_point(aes(size = intersection_size, color = -log10(p_value))) + # Size based on number of DEGs in the pathway
scale_color_gradient(low = "blue", high = "red") + # Color gradient based on p-value
theme_minimal() +
coord_flip() + # Flip coordinates for better readability
labs(title = "Top 21:40 Enriched Pathways",
x = "Pathway",
y = "-log10(p-value)",
size = "Gene Count",
color = "-log10(p-value)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# TO export the resukts and screen through pathways:
#install.packages("openxlsx")
library(openxlsx)
toexport <- gost_res_down$result
write.xlsx(toexport, file = "gprofiler_go_results_down.xlsx")
# bar plots for significance (again you can select the pathways you want to show by defining the variable top_pathways_down - see above):
ggplot(top_pathways_down, aes(x = reorder(term_name, -p_value), y = -log10(p_value))) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip the coordinates for better readability
theme_minimal() +
labs(title = "Top 20 Enriched Pathways by Significance",
x = "Pathway",
y = "-log10(p-value)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
NOW You can do this for upregulated genes … and filter this per day etc …
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 8.10 (Ootpa)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Luxembourg
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gprofiler2_0.2.4 future_1.33.2 openxlsx_4.2.5.2 viridis_0.6.5
## [5] viridisLite_0.4.2 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## [9] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [13] ggplot2_3.5.1 tidyverse_2.0.0 dplyr_1.1.4 Seurat_5.4.0
## [17] SeuratObject_5.3.0 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 jsonlite_1.8.8 magrittr_2.0.3
## [4] spatstat.utils_3.0-5 farver_2.1.2 rmarkdown_2.27
## [7] vctrs_0.6.5 ROCR_1.0-11 spatstat.explore_3.2-7
## [10] RCurl_1.98-1.14 htmltools_0.5.8.1 curl_5.2.1
## [13] sass_0.4.9 sctransform_0.4.1 parallelly_1.37.1
## [16] KernSmooth_2.23-24 bslib_0.7.0 htmlwidgets_1.6.4
## [19] ica_1.0-3 plyr_1.8.9 plotly_4.10.4
## [22] zoo_1.8-12 cachem_1.1.0 igraph_2.0.3
## [25] mime_0.12 lifecycle_1.0.4 pkgconfig_2.0.3
## [28] Matrix_1.7-4 R6_2.5.1 fastmap_1.2.0
## [31] fitdistrplus_1.1-11 shiny_1.8.1.1 digest_0.6.36
## [34] colorspace_2.1-0 patchwork_1.2.0 tensor_1.5
## [37] RSpectra_0.16-1 irlba_2.3.5.1 labeling_0.4.3
## [40] progressr_0.14.0 fansi_1.0.6 spatstat.sparse_3.1-0
## [43] timechange_0.3.0 httr_1.4.7 polyclip_1.10-6
## [46] abind_1.4-5 compiler_4.4.1 withr_3.0.0
## [49] fastDummies_1.7.3 highr_0.11 MASS_7.3-61
## [52] tools_4.4.1 lmtest_0.9-40 zip_2.3.1
## [55] httpuv_1.6.15 future.apply_1.11.2 goftest_1.2-3
## [58] glue_1.7.0 nlme_3.1-165 promises_1.3.0
## [61] grid_4.4.1 Rtsne_0.17 cluster_2.1.6
## [64] reshape2_1.4.4 generics_0.1.3 gtable_0.3.5
## [67] spatstat.data_3.1-2 tzdb_0.4.0 data.table_1.15.4
## [70] hms_1.1.3 utf8_1.2.4 spatstat.geom_3.2-9
## [73] RcppAnnoy_0.0.22 ggrepel_0.9.5 RANN_2.6.1
## [76] pillar_1.9.0 spam_2.10-0 RcppHNSW_0.6.0
## [79] later_1.3.2 splines_4.4.1 lattice_0.22-6
## [82] survival_3.7-0 deldir_2.0-4 tidyselect_1.2.1
## [85] miniUI_0.1.1.1 pbapply_1.7-2 knitr_1.47
## [88] gridExtra_2.3 scattermore_1.2 xfun_0.45
## [91] matrixStats_1.3.0 stringi_1.8.4 lazyeval_0.2.2
## [94] yaml_2.3.8 evaluate_0.24.0 codetools_0.2-20
## [97] cli_3.6.3 uwot_0.2.4 xtable_1.8-4
## [100] reticulate_1.38.0 munsell_0.5.1 jquerylib_0.1.4
## [103] Rcpp_1.1.0 globals_0.16.3 spatstat.random_3.2-3
## [106] png_0.1-8 parallel_4.4.1 dotCall64_1.1-1
## [109] bitops_1.0-7 listenv_0.9.1 scales_1.3.0
## [112] ggridges_0.5.6 rlang_1.1.4 cowplot_1.1.3